This notebook is Tom’s analysis from raw data

source("../CamProt_R/Utility.R")
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ tibble  2.0.0     ✔ purrr   0.2.5
✔ readr   1.3.1     ✔ stringr 1.3.1
✔ tibble  2.0.0     ✔ forcats 0.3.0
── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ MSnbase::combine()       masks Biobase::combine(), BiocGenerics::combine(), dplyr::combine()
✖ S4Vectors::expand()      masks tidyr::expand()
✖ dplyr::filter()          masks stats::filter()
✖ S4Vectors::first()       masks dplyr::first()
✖ dplyr::lag()             masks stats::lag()
✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
✖ purrr::reduce()          masks MSnbase::reduce()
✖ S4Vectors::rename()      masks dplyr::rename()
infile <- "Input/OOPS_qLOPIT_LabelFree_PeptideGroups_parsed.txt"
samples_inf <- "Input/samples.tsv"
peptides_df <- parse_features(infile, silac=FALSE, TMT=FALSE, level="peptide",
                              filter_crap=TRUE, protein_col='Master.Protein.Accessions')
Tally of features at each stage:
29067   All features
These features are associated with 2731 master proteins
27882   Excluding features without a master protein
These features are associated with 2730 master proteins
26302   Excluding features without a unique master protein
These features are associated with 2405 master proteins
26139   Excluding features matching a cRAP protein
These features are associated with 2396 master proteins
Identified an additional 0 proteins as 'cRAP associated'
colnames(peptides_df)
 [1] "Checked"                           "Confidence"                        "Sequence"                         
 [4] "Modifications"                     "Qvality.PEP"                       "Qvality.q.value"                  
 [7] "Number.of.Protein.Groups"          "Number.of.Proteins"                "Number.of.PSMs"                   
[10] "Master.Protein.Accessions"         "Number.of.Missed.Cleavages"        "Theo.MHplus.in.Da"                
[13] "Found.in.Sample.in.S1.F1.Sample"   "Found.in.Sample.in.S2.F2.Sample"   "Found.in.Sample.in.S3.F3.Sample"  
[16] "Found.in.Sample.in.S4.F4.Sample"   "Found.in.Sample.in.S6.F6.Sample"   "Found.in.Sample.in.S5.F5.Sample"  
[19] "Found.in.Sample.in.S7.F7.Sample"   "Found.in.Sample.in.S8.F8.Sample"   "Found.in.Sample.in.S9.F9.Sample"  
[22] "Found.in.Sample.in.S10.F10.Sample" "Found.in.Sample.in.S11.F11.Sample" "Found.in.Sample.in.S12.F12.Sample"
[25] "Found.in.Sample.in.S13.F13.Sample" "Found.in.Sample.in.S14.F14.Sample" "Found.in.Sample.in.S15.F15.Sample"
[28] "Found.in.Sample.in.S16.F16.Sample" "Found.in.Sample.in.S17.F17.Sample" "Found.in.Sample.in.S18.F18.Sample"
[31] "Found.in.Sample.in.S19.F19.Sample" "Found.in.Sample.in.S20.F20.Sample" "Area.F1.Sample"                   
[34] "Area.F2.Sample"                    "Area.F3.Sample"                    "Area.F4.Sample"                   
[37] "Area.F6.Sample"                    "Area.F5.Sample"                    "Area.F7.Sample"                   
[40] "Area.F8.Sample"                    "Area.F9.Sample"                    "Area.F10.Sample"                  
[43] "Area.F11.Sample"                   "Area.F12.Sample"                   "Area.F13.Sample"                  
[46] "Area.F14.Sample"                   "Area.F15.Sample"                   "Area.F16.Sample"                  
[49] "Area.F17.Sample"                   "Area.F18.Sample"                   "Area.F19.Sample"                  
[52] "Area.F20.Sample"                   "XCorr.Sequest.HT"                  "Confidence.Sequest.HT"            
[55] "Percolator.q.Value.Sequest.HT"     "Percolator.PEP.Sequest.HT"         "master_protein"                   
[58] "protein_length"                    "protein_description"               "peptide_start"                    
[61] "peptide_end"                       "crap_protein"                      "associated_crap_protein"          
[64] "unique"                            "filename"                         
peptides_quant <- makeMSNSet(obj=peptides_df, samples_inf=samples_inf, ab_col_ix=2, level="peptide", quant_name="Area")

tallies for missing data (# samples with missing)
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20 
  14   32   87   84   88  120  135  144  220  275  322  380  517  666  936 1234 1696 2528 4033 8276 4352 

4352 peptides do not have any quantification values

So lots and lots of missing values! most peptides are quantified in only a very minor subset of fractions (<5/20). This is no suprise since we’re dealing with LFQ and subcellular fractionation here.

plotMissing(peptides_quant)
Out of 21787 total features, 21773 (99.936%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19 
  32   87   84   88  120  135  144  220  275  322  380  517  666  936 1234 1696 2528 4033 8276 
And 21741 features have more than one missing value

What about if we subset to GO annotated RBPs?

human_go <- readRDS("./Input/h_sapiens_go_full.rds")
GO_RBPs <- human_go %>% filter(GO.ID=="GO:0003723") %>% pull(UNIPROTKB)
peptides_quant[fData(peptides_quant)$master_protein %in% GO_RBPs,] %>% plotMissing()

Ok, so this doesn’t make any difference, 99.9% have a missing value!

Let’s aggregate to the unique peptide sequence level (ignorning variable modifications) and then see whether that reduces our problem

peptides_no_mod_quant <- agg_to_peptides(peptides_quant)
Your data contains missing values. Please read the relevant section in the combineFeatures manual page for
details the effects of missing values on data aggregation.

Nope, not really! We still have 19304 features (previously 21688) and 99.9% have missing values!

plotMissing(peptides_no_mod_quant)
Out of 19403 total features, 19389 (99.928%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19 
  33  100   85   84  129  144  147  220  295  338  385  525  682  911 1202 1647 2397 3582 6483 
And 19356 features have more than one missing value

OK, so we’re going to have to impute. Note that the missing valuesa are particularly present in the first 2 fractions and across fractions 3-7. After that we see fewer missing values. Also, remember that we have yet to identify any definite set of RNAs from the initial fractions (1-7ish). For this reason, we’ll remove these first 5 fractions for now and leave fraction 6 & 7 as these are useful to separate light membranes

plot(colSums(is.na(exprs(peptides_no_mod_quant))))

#peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,8:20]
peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,6:20]
plotMissing(peptides_no_mod_quant_no_lm)
Out of 19403 total features, 19258 (99.253%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
 121  188  233  267  291  332  478  594  816 1063 1562 2381 3561 6592  779 
And 19137 features have more than one missing value
png("./Output/plots/missing_values_peptide.png")
plotMissing(peptides_no_mod_quant_no_lm)
Out of 19403 total features, 19258 (99.253%) have missing values

   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
 121  188  233  267  291  332  478  594  816 1063 1562 2381 3561 6592  779 
And 19137 features have more than one missing value
dev.off()
png 
  2 

Ok, so now we’re down to just 99.1% missing :p but at least we have some more peptides now with relatively few (<=4 missing values)

If we focus on those peptides with 4-9 missing values, we can see that many are missing in a block of sequential fractions. Arguably, these are true missing values, e.g not at random.

missing_n <- rowSums(is.na(exprs(peptides_no_mod_quant_no_lm)))
peptides_no_mod_quant_no_lm[(missing_n>=4 & missing_n<=9),] %>% plotMissing()
Out of 2778 total features, 2778 (100%) have missing values

  4   5   6   7   8   9 
267 291 332 478 594 816 
And 2778 features have more than one missing value

We can identify the missing values which are in a sequential block of >=5 fractions in a row and replace these with zero

First, let’s make a function to identify rows of values where the missing values are not random, e.g 4 or more in a row

test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1)
is_not_at_random <- function(x){
  with(rle(is.na(x)), sum(lengths[values] >= 4))
}
is_not_at_random(test_values_1)
[1] 0
is_not_at_random(test_values_2)
[1] 1
is_not_at_random(test_values_3)
[1] 1

Now, let’s check this identifies the peptides which are not missing at random. First, we’ll remove those without at least 4 quantification values.

peptides_no_mod_quant_4 <- peptides_no_mod_quant_no_lm[missing_n<=(ncol(peptides_no_mod_quant_no_lm)-4),]
missing_not_at_random <- apply(exprs(peptides_no_mod_quant_4), 1, is_not_at_random)
peptides_no_mod_quant_4[missing_not_at_random==1,] %>% plotMissing()
Out of 3847 total features, 3847 (100%) have missing values

   4    5    6    7    8    9   10   11 
  17   63  128  292  418  692  949 1288 
And 3847 features have more than one missing value

peptides_no_mod_quant_4[missing_not_at_random==2,] %>% plotMissing()
Out of 379 total features, 379 (100%) have missing values

  8   9  10  11 
  6  36  76 261 
And 379 features have more than one missing value

Now, let’s extend the function to replace the blocks of missing values with zero

test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1, NA)
rle(is.na(test_values_1))$values
[1] FALSE  TRUE FALSE  TRUE FALSE
rle(is.na(test_values_1))$lengths
[1] 2 1 1 2 4
replace_missing_not_at_random <- function(x){
  missing_rle <- rle(is.na(x))
  
  sequential_blocks <- missing_rle$values
  sequential_blocks[missing_rle$lengths<4] <- FALSE
  replace_with_zero <- rep(sequential_blocks, missing_rle$lengths)
  
  out <- x
  out[replace_with_zero]<-0
  
  return(out)
}
replace_missing_not_at_random(test_values_1)
 [1]  1  1 NA  1 NA NA  1  1  1  1
replace_missing_not_at_random(test_values_2)
 [1] 1 1 0 0 0 0 0 1 1 1
replace_missing_not_at_random(test_values_3)
 [1]  0  0  0  0  1  1  1  1  1 NA

Below we impute a maximum of 10 missing values in sequential blocks with zeros

missing_n2 <- rowSums(is.na(exprs(peptides_no_mod_quant_4)))
peptides_no_mod_quant_4_mnar_zero <- peptides_no_mod_quant_4[missing_n2<=10,]
exprs(peptides_no_mod_quant_4_mnar_zero) <- exprs(peptides_no_mod_quant_4_mnar_zero) %>%
  apply(1, replace_missing_not_at_random) %>% t()

Re-check the missing values

plotMissing(peptides_no_mod_quant_4)
Out of 6090 total features, 5945 (97.619%) have missing values

   1    2    3    4    5    6    7    8    9   10   11 
 121  188  233  267  291  332  478  594  816 1063 1562 
And 5824 features have more than one missing value

plotMissing(peptides_no_mod_quant_4_mnar_zero)
Out of 4528 total features, 3940 (87.014%) have missing values

  1   2   3   4   5   6   7   8   9  10 
639 792 740 586 415 286 186 170  88  38 
And 3301 features have more than one missing value

peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero
imputed_zeros <- rowSums(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)==0, na.rm=TRUE)
missing_n3 <- rowSums(is.na(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)))
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation <- imputed_zeros>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation_n <- imputed_zeros
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing <- missing_n3>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing_n <- missing_n3
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero_mar_knn[missing_n3<=3,]
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))

FALSE  TRUE 
  687  2072 
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing))

FALSE  TRUE 
  588  2171 
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,
            fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
       
        FALSE TRUE
  FALSE   145  443
  TRUE    542 1629
dim(peptides_no_mod_quant_4_mnar_zero)
[1] 4528   15
dim(peptides_no_mod_quant_4_mnar_zero_mar_knn)
[1] 2759   15
peptides_no_mod_quant_4_mnar_zero_mar_knn <- suppressMessages(impute(peptides_no_mod_quant_4_mnar_zero_mar_knn, "knn", k = 10))
Cluster size 2759 broken into 2532 227 
Cluster size 2532 broken into 2182 350 
Cluster size 2182 broken into 1982 200 
Cluster size 1982 broken into 1432 550 
Done cluster 1432 
Done cluster 550 
Done cluster 1982 
Done cluster 200 
Done cluster 2182 
Done cluster 350 
Done cluster 2532 
Done cluster 227 
p <- plotLabelQuant(peptides_no_mod_quant_no_lm, log=TRUE)

p <- plotLabelQuant(peptides_no_mod_quant_4_mnar_zero_mar_knn, log=TRUE)

p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)

p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  !fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)

protein_quant <- agg_to_protein(peptides_no_mod_quant_4_mnar_zero_mar_knn)
print(dim(protein_quant))
[1] 739  15
source("~/git_repos/CamProt_R/LOPIT.R")
markers_df <- read.delim("./Input//markers_9B_hyperLOPIT_vs_DC.csv", sep=",", header=FALSE, stringsAsFactors=FALSE)[,1:2]
markers_df$V2 <- recode(markers_df$V2, "RIBOSOME 40S"="RIBOSOME", "RIBOSOME 60S"="RIBOSOME")
markers_proteins <- markers_df$V2
names(markers_proteins) <- markers_df$V1
protein_quant_am <- addMarkers(normalise(protein_quant, "sum"), markers_proteins)
Markers in data: 123 out of 739
organelleMarkers
          CYTOSOL                ER             GOLGI          LYSOSOME      MITOCHONDRIA           NUCLEUS 
                8                21                 4                 2                 5                16 
NUCLEUS-CHROMATIN        PEROXISOME                PM        PROTEASOME          RIBOSOME           unknown 
                6                 1                18                 3                39               616 
p <- plotHexbin(protein_quant_am, "markers")
print(p)

marker_classes <- getMarkerClasses(protein_quant_am, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_inc_glycoproteins.png")
Saving 7.29 x 4.5 in image

p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p

glycoproteins <- read.delim("./Input/glycoproteins.tsv")$protein

Copied from Manasa’s oopsFinal.Rmd notebook

# We will add a bit more information to the fData file 
# 1. List of known RBPs across cell lines in the XRNAX paper (Table S2)
xrnax = read.delim("./Input/xrnax-genelist.txt",sep="\t",header=T)
xrnax.rbps = xrnax %>% 
              dplyr::filter(!is.na(MCF7.RBP) | !is.na(HEK293.RBP) | !is.na(HeLa.RBP)) %>% 
              dplyr::select(Uniprot.ID:Protein.name)
rownames(xrnax.rbps) = xrnax.rbps$Uniprot.ID
print(length(rownames(xrnax.rbps)))
[1] 1753
# Check how many are common to the cell lines in the XRNAX paper
xrnax %>% 
  dplyr::select(MCF7.RBP:ihRBP) %>%
  apply(2, table,useNA="always")
            MCF7.RBP HEK293.RBP HeLa.RBP ihRBP
non-poly(A)      617        698      565   775
poly(A)          590        659      674   978
<NA>            1276       1126     1244   730
# 2. List of RBPs from SILAC experiments using OOPS after wash step2 (Table S1)
oops = read.delim("./Input/oops-genelist.txt",sep="\t",header=T)
oops.rbps = oops %>% 
              dplyr::filter(CL_NC_Ratio >= 1.0) %>% 
              dplyr::select(master_protein, RBP_glyco)
oops_rbps <- unique(oops.rbps$master_protein)
print(length(oops_rbps))
[1] 405
protein_quant_am_no_glyco <- protein_quant_am[!rownames(protein_quant_am) %in% glycoproteins,]
fData(protein_quant_am_no_glyco)$oops <- rownames(protein_quant_am_no_glyco) %in% oops_rbps
fData(protein_quant_am_no_glyco)$xrnax <- rownames(protein_quant_am_no_glyco) %in% rownames(xrnax.rbps)
fData(protein_quant_am_no_glyco)$go_rbp <- rownames(protein_quant_am_no_glyco) %in% GO_RBPs
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax))
       
        FALSE TRUE
  FALSE   176  202
  TRUE     21  239
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax,
            fData(protein_quant_am_no_glyco)$go_rbp))
, ,  = FALSE

       
        FALSE TRUE
  FALSE   149  104
  TRUE     16   31

, ,  = TRUE

       
        FALSE TRUE
  FALSE    27   98
  TRUE      5  208
print(dim(protein_quant_am))
[1] 739  15
print(dim(protein_quant_am_no_glyco))
[1] 638  15
marker_classes <- getMarkerClasses(protein_quant_am_no_glyco, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2.png")
Saving 7.29 x 4.5 in image

p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4.png")
Saving 7.29 x 4.5 in image

protein_quant_am_no_glyco_yes_rbp <- protein_quant_am_no_glyco[
  (fData(protein_quant_am_no_glyco)$oops |
     fData(protein_quant_am_no_glyco)$xrnax |
    fData(protein_quant_am_no_glyco)$go_rbp),]
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image

translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
 
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=translocon)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_translocon.png")
Saving 7.29 x 4.5 in image

print(dim(protein_quant_am_no_glyco_yes_rbp))
[1] 489  15
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=para)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_paraspeckles.png")
Saving 7.29 x 4.5 in image

p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes, organelle_order=organelle_order) +
  facet_wrap(~markers) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)

ggsave("./Output/plots/marker_profiles.png")
Saving 10 x 10 in image
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes,
                        organelle_order=organelle_order, unknown=TRUE) +
  facet_grid(zero_imputation_n~missing_n) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)

ggsave("./Output/plots/marker_profiles_imputation.png")
Saving 10 x 10 in image

Now we make a new set of markers designed to highlight functional groups of RBPs more usefully. First we need to define new sets of GO annotation proteins, where each marker belongs to only one group

translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
mrna_splicing <- human_go %>% filter(GO.ID=='GO:0000398') %>% pull(UNIPROTKB)
translation_init <- human_go %>% filter(GO.ID=='GO:0006413') %>% pull(UNIPROTKB)
translation_init <- setdiff(translation_init, names(markers_proteins)[markers_proteins=="RIBOSOME"])
cell_cell_adhesion <- human_go %>% filter(GO.ID=='GO:0098609') %>% pull(UNIPROTKB)
cytoskeleton <- human_go %>% filter(GO.ID=='GO:0005856') %>% pull(UNIPROTKB)
motor_activity <- human_go %>% filter(GO.ID=='GO:0003774') %>% pull(UNIPROTKB)
er_stress_response <- human_go %>% filter(GO.ID=='GO:0030968') %>% pull(UNIPROTKB)
nuclear_pore_channel <- human_go %>% filter(GO.ID=='GO:0044613') %>% pull(UNIPROTKB)
nuclear_pore_basket <- human_go %>% filter(GO.ID=='GO:0044615') %>% pull(UNIPROTKB)
tRNA_AA <- human_go %>% filter(GO.ID=='GO:0004812') %>% pull(UNIPROTKB)
#mrna_splicing <- setdiff(mrna_splicing, c(para, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity))
#translation_init <- setdiff(translation_init, c(para, cell_cell_adhesion, cytoskeleton, motor_activity))
#cytoskeleton <- setdiff(cytoskeleton, c(para, cell_cell_adhesion, motor_activity))
#cell_cell_adhesion <- setdiff(cell_cell_adhesion, c(para, motor_activity))
#all_markers <- c(mrna_splicing, para, translocon, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity)
#print(length(all_markers)==length(unique(all_markers)))
mrna_splicing <- setdiff(mrna_splicing, tRNA_AA)
all_markers <- c(mrna_splicing, tRNA_AA)
print(length(all_markers)==length(unique(all_markers)))
[1] TRUE
rbps_markers <- markers_proteins
localisations_to_remove <- c("PEROXISOME", "PROTEASOME", "GOLGI", "LYSOSOME")
rbps_markers <- rbps_markers[!rbps_markers %in% localisations_to_remove]
rbps_markers[rbps_markers =="NUCLEUS-CHROMATIN"] <- "NUCLEUS"
print(table(rbps_markers))
rbps_markers
     CYTOSOL           ER MITOCHONDRIA      NUCLEUS           PM     RIBOSOME 
          96          118          197          171          295           72 
new_markers <- c(#rep("PARASPECKLES", length(para)),
                 rep("mRNA splicing", length(mrna_splicing)),
                 rep("Aminoacyl-tRNA ligase", length(tRNA_AA)),
                 "PARP1"#,
                 #rep("TRANSLATION INITITAION", length(translation_init)),
                 #rep("CELL-CELL ADHESION", length(cell_cell_adhesion)),
                 #rep("MOTOR", length(motor_activity))#,
                 #rep("CYTOSKELETON", length(cytoskeleton))
                 )
names(new_markers) <- c(#para,
                        mrna_splicing,
                        tRNA_AA,
                        "P09874"#,
                        #translation_init,
                        #cell_cell_adhesion,
                        #motor_activity#,
                        #cytoskeleton
                        )
print(table(names(new_markers))[table(names(new_markers))>1])
named integer(0)
rbps_markers <- rbps_markers[!names(rbps_markers) %in% names(new_markers)]
combined_markers <- c(rbps_markers, new_markers)
print(table(combined_markers))
combined_markers
Aminoacyl-tRNA ligase               CYTOSOL                    ER          MITOCHONDRIA         mRNA splicing 
                   42                    93                   118                   193                   321 
              NUCLEUS                 PARP1                    PM              RIBOSOME 
                  149                     1                   295                    72 
print(table(names(combined_markers))[table(names(combined_markers))>1])
named integer(0)
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- NULL
protein_quant_am_no_glyco_yes_rbp <- addMarkers(protein_quant_am_no_glyco_yes_rbp, combined_markers, "new_markers")
Markers in data: 168 out of 489
organelleMarkers
Aminoacyl-tRNA ligase               CYTOSOL                    ER          MITOCHONDRIA         mRNA splicing 
                   13                     5                     7                     2                    86 
              NUCLEUS                 PARP1                    PM              RIBOSOME               unknown 
                   12                     1                     3                    39                   321 
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- recode(
  fData(protein_quant_am_no_glyco_yes_rbp)$new_markers, "NUCLEUS"="Nucleus", "RIBOSOME"="Ribosome", "CYTOSOL"="Cytosol",
  "MITOCHONDRIA"="Mitochondria") 
print(table(fData(protein_quant_am_no_glyco_yes_rbp)$new_markers))

Aminoacyl-tRNA ligase               Cytosol                    ER          Mitochondria         mRNA splicing 
                   13                     5                     7                     2                    86 
              Nucleus                 PARP1                    PM              Ribosome               unknown 
                   12                     1                     3                    39                   321 

After adding these new RBP markers, we only have 1 PM and 2 Mt proteins remaining. Let’s remove the PM protein

fData(protein_quant_am_no_glyco_yes_rbp)$new_markers[fData(protein_quant_am_no_glyco_yes_rbp)$new_markers=="PM"] <- "unknown"
new_markers_levels <- getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_channel)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_basket)
the condition has length > 1 and only the first element will be used
print(p)
$p

$p_foi

getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
[1] "Aminoacyl-tRNA ligase" "Cytosol"               "ER"                    "Mitochondria"         
[5] "mRNA splicing"         "Nucleus"               "PARP1"                 "Ribosome"             
set.seed(1)
proj <- make_proj("t-SNE", protein_quant_am_no_glyco_yes_rbp, "new_markers")
Loading required namespace: Rtsne
library(Hmisc)
Loading required package: lattice
Loading required package: survival

Attaching package: ‘survival’

The following object is masked from ‘package:robustbase’:

    heart

Loading required package: Formula

Attaching package: ‘Hmisc’

The following object is masked from ‘package:AnnotationDbi’:

    contents

The following objects are masked from ‘package:MSnbase’:

    impute, naplot

The following object is masked from ‘package:Biobase’:

    contents

The following objects are masked from ‘package:dplyr’:

    src, summarize

The following objects are masked from ‘package:base’:

    format.pval, units
proj_df <- proj$PCA_df
marker_levels <- setdiff(new_markers_levels, "unknown")
marker_levels <- marker_levels[c(2,3,4,6,8,1,5,7)]
#m_colours <- getStockcol()[c(1,7,4,3,2,5)]
m_colours <- c(cbPalette[c(6,3,2,4,8,7,5)], "grey20")
proj_df$markers <- factor(proj_df$new_markers, levels=marker_levels)
print(table(is.na(proj_df$markers)))

FALSE  TRUE 
  165   324 
proj_df$unknown <- proj_df$new_markers=="unknown"
p <- ggplot(proj_df, aes(X, Y, colour=markers)) +
    geom_point(aes(X, Y, colour=markers, alpha=unknown), size=2) +
    scale_alpha_manual(values=c(1,0.2), guide=F) +
    my_theme +
    scale_colour_manual(values=m_colours, na.value="grey60", name="", breaks=c(marker_levels)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1, size=2))) +
    xlab("Dimension 1") + ylab("Dimension 2")
print(p)
ggsave("./Output/tSNE_RBP_markers.png")
Saving 7.29 x 4.5 in image

write.table(proj_df, "./Output/tSNE_projections.tsv", sep="\t", quote=FALSE, row.name=FALSE)
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "new_markers",
                        keep_markers=getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers"), organelle_order=new_markers_levels) +
  facet_wrap(~markers) +
  scale_colour_manual(values=getClassColours()[c(1:5, 7, 10:20)], guide=FALSE) +
  theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)

protein_info <- read.delim("Input/Aggregated-proteins-2187-with-uniprot.tab") %>%
  dplyr::select(Entry, gene_name=Gene.names...primary..)
total.prot = readRDS("./Input/prot_res_20_fractions_imputed_markers.rds")
colnames(total.prot)[12] <- "0.948"
colnames(total.prot) <- c(seq(1,20,2), seq(2,20,2))
total.prot <- total.prot[,order(as.numeric(as.character(colnames(total.prot))))]
total.prot <- total.prot[,5:19]
total.prot = normalise(total.prot,"sum")
rbp.prot <- protein_quant_am_no_glyco_yes_rbp
colnames(rbp.prot) <- 5:19
print(dim(total.prot))
[1] 5610   15
print(dim(rbp.prot))
[1] 489  15
print(colnames(total.prot))
 [1] "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
print(colnames(rbp.prot))
 [1] "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
plotCombinedProfiles <- function(foi){
  
  foi_in_total <- intersect(foi, rownames(total.prot))
  foi_in_rbp <- intersect(foi, rownames(rbp.prot))
  
  foi_in_both <- intersect(foi_in_rbp, foi_in_total)
  
  print(foi_in_both)
  if(length(foi_in_both)==0){
    return(NA)
  }
  
  total_exprs_df <- exprs(total.prot[foi_in_both,])
  total_exprs_df <- melt(total_exprs_df)
  total_exprs_df$type = "All protein"
  
  rbp_exprs_df <- exprs(rbp.prot[foi_in_both,])
  rbp_exprs_df <- melt(rbp_exprs_df)
  rbp_exprs_df$type = "RNA-bound"
  
  #print(head(total_exprs_df))
  #print(head(rbp_exprs_df))
  #print(head(rbind(rbp_exprs_df, total_exprs_df)))
  
  p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=type)) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  if(length(foi_in_both)>1){
    p <- p + facet_wrap(~gene_name)
  }
  
  print(p)
  
  p2 <- p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=interaction(type, gene_name))) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  print(p2)
  
  invisible(list("p"=p, "p2"=p2))
}
p <- plotCombinedProfiles(para)
[1] "P23246" "Q15233" "P52272"

ggsave("Output/plots/paraspeckle.png", p$p2)
Saving 7.29 x 4.5 in image

plotCombinedProfiles(tRNA_AA)
 [1] "Q9NSE4" "P41252" "P54577" "P49589" "P49591" "P14868" "O43776" "P07814" "P47897" "P26639" "P26640" "P49588"
[13] "P41250"

plotCombinedProfiles(motor_activity)
[1] "Q14203" "P33176" "O00159" "Q14204" "P35579" "P52732" "Q13409" "P35580" "P60660"

fvarLabels(rbp.prot)
 [1] "Checked"                       "Confidence"                    "Sequence"                     
 [4] "Modifications"                 "Qvality.PEP"                   "Qvality.q.value"              
 [7] "Number.of.Protein.Groups"      "Number.of.Proteins"            "Number.of.PSMs"               
[10] "Master.Protein.Accessions"     "Number.of.Missed.Cleavages"    "Theo.MHplus.in.Da"            
[13] "XCorr.Sequest.HT"              "Confidence.Sequest.HT"         "Percolator.q.Value.Sequest.HT"
[16] "Percolator.PEP.Sequest.HT"     "master_protein"                "protein_length"               
[19] "protein_description"           "peptide_start"                 "peptide_end"                  
[22] "crap_protein"                  "associated_crap_protein"       "unique"                       
[25] "filename"                      "zero_imputation"               "zero_imputation_n"            
[28] "missing"                       "missing_n"                     "CV.F6"                        
[31] "CV.F7"                         "CV.F8"                         "CV.F9"                        
[34] "CV.F10"                        "CV.F11"                        "CV.F12"                       
[37] "CV.F13"                        "CV.F14"                        "CV.F15"                       
[40] "CV.F16"                        "CV.F17"                        "CV.F18"                       
[43] "CV.F19"                        "CV.F20"                        "markers"                      
[46] "oops"                          "xrnax"                         "go_rbp"                       
[49] "new_markers"                  
well_quantified_rbps <- fData(rbp.prot) %>%
  filter(zero_imputation_n<=4, missing_n<=2) %>%
  pull(master_protein)
plotCombinedProfiles(intersect(mrna_splicing, well_quantified_rbps))
[1] "Q01081" "Q00839" "P67809" "P11142" "O43290" "Q13247" "Q15366" "Q8IYB3"

ribosome_proteins <- names(markers_proteins)[markers_proteins=="RIBOSOME"]
plotCombinedProfiles(intersect(ribosome_proteins, well_quantified_rbps))
[1] "P15880" "P46781" "P35268" "P62750" "Q02543" "P36578" "P47914"

plotCombinedProfiles("Q15942")
[1] "Q15942"

p <- plotCombinedProfiles("P09874")

ggsave("./Output/PARP1_profiles.png", p$p2)
---
title: "R Notebook"
output: html_notebook
---

This notebook is Tom's analysis from raw data

```{r}
source("../CamProt_R/Utility.R")
library(tidyverse)
infile <- "Input/OOPS_qLOPIT_LabelFree_PeptideGroups_parsed.txt"
samples_inf <- "Input/samples.tsv"
```

```{r}
peptides_df <- parse_features(infile, silac=FALSE, TMT=FALSE, level="peptide",
                              filter_crap=TRUE, protein_col='Master.Protein.Accessions')
```

```{r}
colnames(peptides_df)

peptides_quant <- makeMSNSet(obj=peptides_df, samples_inf=samples_inf, ab_col_ix=2, level="peptide", quant_name="Area")

```
So lots and lots of missing values! most peptides are quantified in only a very minor subset of fractions (<5/20). This is no suprise since we're dealing with LFQ and subcellular fractionation here.
```{r}
plotMissing(peptides_quant)
```
What about if we subset to GO annotated RBPs?
```{r}
human_go <- readRDS("./Input/h_sapiens_go_full.rds")
GO_RBPs <- human_go %>% filter(GO.ID=="GO:0003723") %>% pull(UNIPROTKB)
```

```{r}
peptides_quant[fData(peptides_quant)$master_protein %in% GO_RBPs,] %>% plotMissing()

```
Ok, so this doesn't make any difference, 99.9% have a missing value!

Let's aggregate to the unique peptide sequence level (ignorning variable modifications) and then see whether that reduces our problem

```{r}
peptides_no_mod_quant <- agg_to_peptides(peptides_quant)
```

Nope, not really! We still have 19304 features (previously 21688) and 99.9% have missing values!
```{r}
plotMissing(peptides_no_mod_quant)
```

OK, so we're going to have to impute. Note that the missing valuesa are particularly present in the first 2 fractions and across fractions 3-7. After that we see fewer missing values. Also, remember that we have yet to identify any definite set of RNAs from the initial fractions (1-7ish). For this reason, we'll remove these first 5 fractions for now and leave fraction 6 & 7 as these are useful to separate light membranes
```{r}
plot(colSums(is.na(exprs(peptides_no_mod_quant))))
```


```{r}
#peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,8:20]
peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,6:20]
plotMissing(peptides_no_mod_quant_no_lm)

png("./Output/plots/missing_values_peptide.png")
plotMissing(peptides_no_mod_quant_no_lm)
dev.off()

```
Ok, so now we're down to _just_ 99.1% missing :p but at least we have some more peptides now with relatively few (<=4 missing values)

If we focus on those peptides with 4-9 missing values, we can see that many are missing in a block of sequential fractions. Arguably, these are true missing values, e.g not at random. 
```{r}
missing_n <- rowSums(is.na(exprs(peptides_no_mod_quant_no_lm)))
peptides_no_mod_quant_no_lm[(missing_n>=4 & missing_n<=9),] %>% plotMissing()
```
We can identify the missing values which are in a sequential block of >=5 fractions in a row and replace these with zero

First, let's make a function to identify rows of values where the missing values are not random, e.g 4 or more in a row
```{r}
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1)

is_not_at_random <- function(x){
  with(rle(is.na(x)), sum(lengths[values] >= 4))
}

is_not_at_random(test_values_1)
is_not_at_random(test_values_2)
is_not_at_random(test_values_3)
```

Now, let's check this identifies the peptides which are not missing at random. First, we'll remove those without at least 4 quantification values.
```{r}
peptides_no_mod_quant_4 <- peptides_no_mod_quant_no_lm[missing_n<=(ncol(peptides_no_mod_quant_no_lm)-4),]
missing_not_at_random <- apply(exprs(peptides_no_mod_quant_4), 1, is_not_at_random)
```

```{r}
peptides_no_mod_quant_4[missing_not_at_random==1,] %>% plotMissing()
peptides_no_mod_quant_4[missing_not_at_random==2,] %>% plotMissing()
```
Now, let's extend the function to replace the blocks of missing values with zero
```{r}
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1, NA)


rle(is.na(test_values_1))$values
rle(is.na(test_values_1))$lengths

replace_missing_not_at_random <- function(x){
  missing_rle <- rle(is.na(x))
  
  sequential_blocks <- missing_rle$values
  sequential_blocks[missing_rle$lengths<4] <- FALSE

  replace_with_zero <- rep(sequential_blocks, missing_rle$lengths)
  
  out <- x
  out[replace_with_zero]<-0
  
  return(out)

}

replace_missing_not_at_random(test_values_1)
replace_missing_not_at_random(test_values_2)
replace_missing_not_at_random(test_values_3)
```


Below we impute a maximum of 10 missing values in sequential blocks with zeros
```{r}
missing_n2 <- rowSums(is.na(exprs(peptides_no_mod_quant_4)))
peptides_no_mod_quant_4_mnar_zero <- peptides_no_mod_quant_4[missing_n2<=10,]

exprs(peptides_no_mod_quant_4_mnar_zero) <- exprs(peptides_no_mod_quant_4_mnar_zero) %>%
  apply(1, replace_missing_not_at_random) %>% t()
```

Re-check the missing values
```{r}
plotMissing(peptides_no_mod_quant_4)
plotMissing(peptides_no_mod_quant_4_mnar_zero)
```

```{r}
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero

imputed_zeros <- rowSums(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)==0, na.rm=TRUE)
missing_n3 <- rowSums(is.na(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)))

fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation <- imputed_zeros>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation_n <- imputed_zeros

fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing <- missing_n3>0 
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing_n <- missing_n3


peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero_mar_knn[missing_n3<=3,]

print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing))

print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,
            fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))

dim(peptides_no_mod_quant_4_mnar_zero)
dim(peptides_no_mod_quant_4_mnar_zero_mar_knn)
peptides_no_mod_quant_4_mnar_zero_mar_knn <- suppressMessages(impute(peptides_no_mod_quant_4_mnar_zero_mar_knn, "knn", k = 10))
```

```{r}
p <- plotLabelQuant(peptides_no_mod_quant_no_lm, log=TRUE)
p <- plotLabelQuant(peptides_no_mod_quant_4_mnar_zero_mar_knn, log=TRUE)
```

```{r}
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
  !fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
```

```{r}
protein_quant <- agg_to_protein(peptides_no_mod_quant_4_mnar_zero_mar_knn)
print(dim(protein_quant))
```

```{r}
source("~/git_repos/CamProt_R/LOPIT.R")
```

```{r}
markers_df <- read.delim("./Input//markers_9B_hyperLOPIT_vs_DC.csv", sep=",", header=FALSE, stringsAsFactors=FALSE)[,1:2]
markers_df$V2 <- recode(markers_df$V2, "RIBOSOME 40S"="RIBOSOME", "RIBOSOME 60S"="RIBOSOME")
markers_proteins <- markers_df$V2
names(markers_proteins) <- markers_df$V1

protein_quant_am <- addMarkers(normalise(protein_quant, "sum"), markers_proteins)

p <- plotHexbin(protein_quant_am, "markers")
print(p)
```


```{r}

marker_classes <- getMarkerClasses(protein_quant_am, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
print(p)
ggsave("./Output/plots/pca_1_2_inc_glycoproteins.png")

p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
print(p)
```

```{r}
glycoproteins <- read.delim("./Input/glycoproteins.tsv")$protein
```

Copied from Manasa's oopsFinal.Rmd notebook
```{r}
# We will add a bit more information to the fData file 
# 1. List of known RBPs across cell lines in the XRNAX paper (Table S2)
xrnax = read.delim("./Input/xrnax-genelist.txt",sep="\t",header=T)
xrnax.rbps = xrnax %>% 
              dplyr::filter(!is.na(MCF7.RBP) | !is.na(HEK293.RBP) | !is.na(HeLa.RBP)) %>% 
              dplyr::select(Uniprot.ID:Protein.name)
rownames(xrnax.rbps) = xrnax.rbps$Uniprot.ID
print(length(rownames(xrnax.rbps)))

# Check how many are common to the cell lines in the XRNAX paper
xrnax %>% 
  dplyr::select(MCF7.RBP:ihRBP) %>%
  apply(2, table,useNA="always")

# 2. List of RBPs from SILAC experiments using OOPS after wash step2 (Table S1)
oops = read.delim("./Input/oops-genelist.txt",sep="\t",header=T)
oops.rbps = oops %>% 
              dplyr::filter(CL_NC_Ratio >= 1.0) %>% 
              dplyr::select(master_protein, RBP_glyco)

oops_rbps <- unique(oops.rbps$master_protein)
print(length(oops_rbps))
```

```{r}
protein_quant_am_no_glyco <- protein_quant_am[!rownames(protein_quant_am) %in% glycoproteins,]
fData(protein_quant_am_no_glyco)$oops <- rownames(protein_quant_am_no_glyco) %in% oops_rbps
fData(protein_quant_am_no_glyco)$xrnax <- rownames(protein_quant_am_no_glyco) %in% rownames(xrnax.rbps)
fData(protein_quant_am_no_glyco)$go_rbp <- rownames(protein_quant_am_no_glyco) %in% GO_RBPs

print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax))
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax,
            fData(protein_quant_am_no_glyco)$go_rbp))

print(dim(protein_quant_am))
print(dim(protein_quant_am_no_glyco))
```
```{r}
marker_classes <- getMarkerClasses(protein_quant_am_no_glyco, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
print(p)
ggsave("./Output/plots/pca_1_2.png")

p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
print(p)
ggsave("./Output/plots/pca_3_4.png")
```
```{r}
protein_quant_am_no_glyco_yes_rbp <- protein_quant_am_no_glyco[
  (fData(protein_quant_am_no_glyco)$oops |
     fData(protein_quant_am_no_glyco)$xrnax |
    fData(protein_quant_am_no_glyco)$go_rbp),]

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
print(p)
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps.png")


p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
print(p)
ggsave("./Output/plots/pca_3_4_no_glyco_only_rbps.png")
```

```{r}
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)

 
```

```{r}
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=translocon)
print(p)
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_translocon.png")
```
```{r}
print(dim(protein_quant_am_no_glyco_yes_rbp))
```

```{r}
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=para)
print(p)
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_paraspeckles.png")
```



```{r, fig.width=10, fig.height=10}
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes, organelle_order=organelle_order) +
  facet_wrap(~markers) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
print(p)
ggsave("./Output/plots/marker_profiles.png")

p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes,
                        organelle_order=organelle_order, unknown=TRUE) +
  facet_grid(zero_imputation_n~missing_n) +
  scale_colour_manual(values=m_colours, guide=FALSE) +
  theme(legend.position="none")
print(p)
ggsave("./Output/plots/marker_profiles_imputation.png")
```

Now we make a new set of markers designed to highlight functional groups of RBPs more usefully. First we need to define new sets of GO annotation proteins, where each marker belongs to only one group
```{r}
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
mrna_splicing <- human_go %>% filter(GO.ID=='GO:0000398') %>% pull(UNIPROTKB)
translation_init <- human_go %>% filter(GO.ID=='GO:0006413') %>% pull(UNIPROTKB)
translation_init <- setdiff(translation_init, names(markers_proteins)[markers_proteins=="RIBOSOME"])

cell_cell_adhesion <- human_go %>% filter(GO.ID=='GO:0098609') %>% pull(UNIPROTKB)
cytoskeleton <- human_go %>% filter(GO.ID=='GO:0005856') %>% pull(UNIPROTKB)
motor_activity <- human_go %>% filter(GO.ID=='GO:0003774') %>% pull(UNIPROTKB)
er_stress_response <- human_go %>% filter(GO.ID=='GO:0030968') %>% pull(UNIPROTKB)
nuclear_pore_channel <- human_go %>% filter(GO.ID=='GO:0044613') %>% pull(UNIPROTKB)
nuclear_pore_basket <- human_go %>% filter(GO.ID=='GO:0044615') %>% pull(UNIPROTKB)
tRNA_AA <- human_go %>% filter(GO.ID=='GO:0004812') %>% pull(UNIPROTKB)

#mrna_splicing <- setdiff(mrna_splicing, c(para, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity))
#translation_init <- setdiff(translation_init, c(para, cell_cell_adhesion, cytoskeleton, motor_activity))
#cytoskeleton <- setdiff(cytoskeleton, c(para, cell_cell_adhesion, motor_activity))
#cell_cell_adhesion <- setdiff(cell_cell_adhesion, c(para, motor_activity))

#all_markers <- c(mrna_splicing, para, translocon, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity)
#print(length(all_markers)==length(unique(all_markers)))


mrna_splicing <- setdiff(mrna_splicing, tRNA_AA)

all_markers <- c(mrna_splicing, tRNA_AA)
print(length(all_markers)==length(unique(all_markers)))

```





```{r}

rbps_markers <- markers_proteins
localisations_to_remove <- c("PEROXISOME", "PROTEASOME", "GOLGI", "LYSOSOME")

rbps_markers <- rbps_markers[!rbps_markers %in% localisations_to_remove]
rbps_markers[rbps_markers =="NUCLEUS-CHROMATIN"] <- "NUCLEUS"

print(table(rbps_markers))


new_markers <- c(#rep("PARASPECKLES", length(para)),
                 rep("mRNA splicing", length(mrna_splicing)),
                 rep("Aminoacyl-tRNA ligase", length(tRNA_AA)),
                 "PARP1"#,
                 #rep("TRANSLATION INITITAION", length(translation_init)),
                 #rep("CELL-CELL ADHESION", length(cell_cell_adhesion)),
                 #rep("MOTOR", length(motor_activity))#,
                 #rep("CYTOSKELETON", length(cytoskeleton))
                 )

names(new_markers) <- c(#para,
                        mrna_splicing,
                        tRNA_AA,
                        "P09874"#,
                        #translation_init,
                        #cell_cell_adhesion,
                        #motor_activity#,
                        #cytoskeleton
                        )
print(table(names(new_markers))[table(names(new_markers))>1])

rbps_markers <- rbps_markers[!names(rbps_markers) %in% names(new_markers)]

combined_markers <- c(rbps_markers, new_markers)
print(table(combined_markers))
print(table(names(combined_markers))[table(names(combined_markers))>1])

fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- NULL
protein_quant_am_no_glyco_yes_rbp <- addMarkers(protein_quant_am_no_glyco_yes_rbp, combined_markers, "new_markers")

fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- recode(
  fData(protein_quant_am_no_glyco_yes_rbp)$new_markers, "NUCLEUS"="Nucleus", "RIBOSOME"="Ribosome", "CYTOSOL"="Cytosol",
  "MITOCHONDRIA"="Mitochondria") 

print(table(fData(protein_quant_am_no_glyco_yes_rbp)$new_markers))
```
```{r}

```


After adding these new RBP markers, we only have 1 PM and 2 Mt proteins remaining. Let's remove the PM protein
```{r}
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers[fData(protein_quant_am_no_glyco_yes_rbp)$new_markers=="PM"] <- "unknown"
```

```{r}
new_markers_levels <- getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")

p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_channel)
print(p)


p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
        m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_basket)
print(p)

```


```{r}
getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")

set.seed(1)
proj <- make_proj("t-SNE", protein_quant_am_no_glyco_yes_rbp, "new_markers")
```

```{r}
library(Hmisc)
```

```{r}
proj_df <- proj$PCA_df
marker_levels <- setdiff(new_markers_levels, "unknown")
marker_levels <- marker_levels[c(2,3,4,6,8,1,5,7)]
#m_colours <- getStockcol()[c(1,7,4,3,2,5)]
m_colours <- c(cbPalette[c(6,3,2,4,8,7,5)], "grey20")

proj_df$markers <- factor(proj_df$new_markers, levels=marker_levels)
print(table(is.na(proj_df$markers)))
proj_df$unknown <- proj_df$new_markers=="unknown"

p <- ggplot(proj_df, aes(X, Y, colour=markers)) +
    geom_point(aes(X, Y, colour=markers, alpha=unknown), size=2) +
    scale_alpha_manual(values=c(1,0.2), guide=F) +
    my_theme +
    scale_colour_manual(values=m_colours, na.value="grey60", name="", breaks=c(marker_levels)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1, size=2))) +
    xlab("Dimension 1") + ylab("Dimension 2")

print(p)
ggsave("./Output/tSNE_RBP_markers.png")
write.table(proj_df, "./Output/tSNE_projections.tsv", sep="\t", quote=FALSE, row.name=FALSE)


```
```{r}
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "new_markers",
                        keep_markers=getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers"), organelle_order=new_markers_levels) +
  facet_wrap(~markers) +
  scale_colour_manual(values=getClassColours()[c(1:5, 7, 10:20)], guide=FALSE) +
  theme(legend.position="none")
print(p)
```
```{r}

protein_info <- read.delim("Input/Aggregated-proteins-2187-with-uniprot.tab") %>%
  dplyr::select(Entry, gene_name=Gene.names...primary..)

total.prot = readRDS("./Input/prot_res_20_fractions_imputed_markers.rds")
colnames(total.prot)[12] <- "0.948"
colnames(total.prot) <- c(seq(1,20,2), seq(2,20,2))
total.prot <- total.prot[,order(as.numeric(as.character(colnames(total.prot))))]
total.prot <- total.prot[,5:19]
total.prot = normalise(total.prot,"sum")

rbp.prot <- protein_quant_am_no_glyco_yes_rbp
colnames(rbp.prot) <- 5:19

print(dim(total.prot))
print(dim(rbp.prot))

print(colnames(total.prot))
print(colnames(rbp.prot))

```



```{r}
plotCombinedProfiles <- function(foi){
  
  foi_in_total <- intersect(foi, rownames(total.prot))
  foi_in_rbp <- intersect(foi, rownames(rbp.prot))
  
  foi_in_both <- intersect(foi_in_rbp, foi_in_total)
  
  print(foi_in_both)
  if(length(foi_in_both)==0){
    return(NA)
  }
  
  total_exprs_df <- exprs(total.prot[foi_in_both,])
  total_exprs_df <- melt(total_exprs_df)
  total_exprs_df$type = "All protein"
  
  rbp_exprs_df <- exprs(rbp.prot[foi_in_both,])
  rbp_exprs_df <- melt(rbp_exprs_df)
  rbp_exprs_df$type = "RNA-bound"
  
  #print(head(total_exprs_df))
  #print(head(rbp_exprs_df))
  #print(head(rbind(rbp_exprs_df, total_exprs_df)))
  
  p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=type)) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  if(length(foi_in_both)>1){
    p <- p + facet_wrap(~gene_name)
  }
  
  print(p)
  
  p2 <- p <- rbind(rbp_exprs_df, total_exprs_df) %>%
    merge(protein_info, by.x="Var1", by.y="Entry") %>%
    ggplot(aes(Var2, value, colour=type, group=interaction(type, gene_name))) +
    my_theme + geom_line() +
    theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))  +
    xlab("Fraction") + ylab("Sumn normalised\nabundance") + 
    scale_colour_discrete(name="", na.value="grey")
  
  print(p2)
  
  invisible(list("p"=p, "p2"=p2))
}
```


```{r}
p <- plotCombinedProfiles(para)
ggsave("Output/plots/paraspeckle.png", p$p2)
```

```{r, fig.width=10, fig.height=10}
plotCombinedProfiles(tRNA_AA)

```

```{r, fig.width=10, fig.height=10}
plotCombinedProfiles(motor_activity)
```

```{r}
fvarLabels(rbp.prot)
well_quantified_rbps <- fData(rbp.prot) %>%
  filter(zero_imputation_n<=4, missing_n<=2) %>%
  pull(master_protein)

plotCombinedProfiles(intersect(mrna_splicing, well_quantified_rbps))
```

```{r}
ribosome_proteins <- names(markers_proteins)[markers_proteins=="RIBOSOME"]
plotCombinedProfiles(intersect(ribosome_proteins, well_quantified_rbps))
```

```{r}
plotCombinedProfiles("Q15942")
```

```{r}
p <- plotCombinedProfiles("P09874")

ggsave("./Output/PARP1_profiles.png", p$p2)

```

